# Bibliotecas
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
# Show every column when displaying frames (the Ames set has ~80 of them)
pd.set_option('display.max_columns', None)
# Load the training data, skipping the first two file columns (row index / order).
# usecols accepts any list-like of ints, so a range is enough here.
ames_train = pd.read_csv('ames_train.csv', sep=';', usecols=range(2, 82))
# Move the target 'price' to the front programmatically instead of hard-coding
# all 80 column names, which silently breaks if the file schema changes.
# Downstream code accesses columns by name, so the relative order of the
# remaining columns does not matter.
ames_train = ames_train[['price'] + [c for c in ames_train.columns if c != 'price']]
ames_train.info()
# Quick numeric EDA: dtypes, summary statistics and missing-value counts
ames_numeric = ames_train.select_dtypes(include=[np.number])
ames_numeric.dtypes
ames_numeric.describe()
print(ames_numeric.isnull().sum().to_string())
# Inspect a few columns of interest individually (notebook-style expressions;
# their return values are only rendered in an interactive session)
for column in ('Lot.Frontage', 'Mas.Vnr.Area', 'BsmtFin.SF.1'):
    ames_train[column].describe()
for column in ('Bsmt.Full.Bath', 'Full.Bath'):
    ames_train[column].value_counts()
That's odd.
ames_train['Garage.Yr.Blt'].value_counts()
ames_train['Sale.Condition'].value_counts()
# Impute numeric gaps: continuous measurements get the column mean, while
# count-like / "feature absent" columns (basement baths, garage fields) get 0.
# Two loops replace eleven near-identical fillna lines.
mean_imputed = ['Lot.Frontage', 'Mas.Vnr.Area', 'BsmtFin.SF.1',
                'BsmtFin.SF.2', 'Bsmt.Unf.SF', 'Total.Bsmt.SF']
for column in mean_imputed:
    ames_train[column] = ames_train[column].fillna(ames_train[column].mean())
zero_imputed = ['Bsmt.Full.Bath', 'Bsmt.Half.Bath', 'Garage.Yr.Blt',
                'Garage.Area', 'Garage.Cars']
for column in zero_imputed:
    ames_train[column] = ames_train[column].fillna(0)
print(ames_train.isnull().sum().to_string())
# Categorical EDA and imputation. For these columns NaN encodes the absence
# of the feature (no alley, no basement, ...), so fill with an explicit
# sentinel label rather than dropping rows.
ames_categorical = ames_train.select_dtypes(exclude=[np.number])
ames_categorical.describe()
print(ames_categorical.isnull().sum().to_string())
sentinel_by_column = {
    'Alley': 'No Alley',
    # BUG FIX: this column was previously filled with the literal string
    # 'Mas.Vnr.Type' (a copy-paste slip); use a proper "absent" label.
    'Mas.Vnr.Type': 'No Mas.Vnr',
    'Bsmt.Qual': 'No Bsmt',
    'Bsmt.Cond': 'No Bsmt',
    'Bsmt.Exposure': 'No Bsmt',
    'BsmtFin.Type.1': 'No Bsmt',
    'BsmtFin.Type.2': 'No Bsmt',
    'Garage.Type': 'No Garage',
    'Garage.Finish': 'No Garage',
    'Garage.Qual': 'No Garage',
    'Garage.Cond': 'No Garage',
    'Pool.QC': 'No Pool',
    'Fence': 'No Fence',
    'Misc.Feature': 'No Misc.Feature',
    'Fireplace.Qu': 'No Fireplace',
}
# fillna is the idiomatic (and NaN-safe) way to do replace({np.nan: ...})
for column, label in sentinel_by_column.items():
    ames_train[column] = ames_train[column].fillna(label)
print(ames_train.isnull().sum().to_string())
# With the data cleaned, evaluate correlations among the numeric variables.
# NOTE(review): ames_numeric was selected before the imputation above, so it
# still carries the original NaNs; pandas' corr() handles them pairwise —
# confirm this is intended.
corr = ames_numeric.corr()
fig_corr, ax_corr = plt.subplots(figsize=(36, 18))
# Mask the redundant upper triangle of the symmetric correlation matrix
mask = np.triu(np.ones_like(corr, dtype=bool))
# Diverging palette so positive/negative correlations read at a glance
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Kept in seaborn: this is the one chart not reproduced with plotly
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap)
Some variables stand out for their high correlation with price. Area, as expected, is one of them and, looking closer, we see that there are several variables that make up the area of the property; later on they will be unified in order to simplify the data structure and avoid multicollinearities. Overall build quality has a very high correlation. It is a discrete variable and can even be considered a category. Year of construction is also a variable expected to be important in the composition of prices, since older houses tend to have a lower value per area due to depreciation, style and construction layout. The garage, whether it exists or not, will have a big impact as well. Although this variable falls under the area class, described above, I will deal with it separately in the analysis.
# Keep only the variables most correlated with price. The .copy() makes this
# an independent frame, so adding the derived columns below does not trigger
# SettingWithCopyWarning (or silently fail to propagate) on a slice view.
ames_clean = ames_train[['price', 'area', 'X1st.Flr.SF', 'X2nd.Flr.SF',
                         'Low.Qual.Fin.SF', 'Total.Bsmt.SF', 'Year.Built',
                         'Overall.Qual', 'Neighborhood', 'Bldg.Type',
                         'Garage.Area', 'Garage.Type']].copy()
# Unify the area-related columns and derive the price per square foot
ames_clean['Total.Area'] = ames_train['area'] + ames_train['Total.Bsmt.SF']
ames_clean['PricePerSF'] = ames_clean['price'] / ames_clean['Total.Area']
ames_clean = ames_clean[['price', 'Total.Area', 'PricePerSF', 'Year.Built',
                         'Garage.Area', 'Overall.Qual', 'Neighborhood',
                         'Bldg.Type', 'Garage.Type']]
ames_clean.head()
As a matter of fact, the first curiosity I had was whether property prices showed significant variations in the analyzed period. And, as can be seen in the chart below, it did not. In fact, what is observed is great stability in property prices, with their medians very close. The occurrence becomes especially curious if we remember that the observed period comprises the 2008 crisis that began precisely in the real estate sector. A factor that possibly partly explains such stability is that the city of Ames has practically 2/5 of its population employed at Iowa State University or government agencies, which suggests greater stability in per capita income. This information, however, would need to be further analyzed and is currently beyond the scope of this report.
# Box plot of price by year sold (the unused `df = px.data.tips()` demo
# leftover was removed — it was never referenced)
fig = px.box(ames_train, x='Yr.Sold', y="price", color='Yr.Sold')
fig.show()
We have a slightly right-skewed distribution (most of the mass at lower prices, with a long right tail), suggesting that most of the trades were in a price range that goes up to around $150,000.
# Distribution of sale price
fig = px.histogram(ames_clean, x="price", nbins=50)
fig.show()
# Distribution of price per square foot; pass the column name (not the
# Series) for consistency with the call above
fig = px.histogram(ames_clean, x="PricePerSF", nbins=30)
fig.show()
Evaluating the variables with the highest correlation (note that the area variable was created from the sum of the built areas) the result of the graphs gives us a view that was already expected a priori, but that allows us to observe the existence, or not, of outliers.
# 2x2 grid of price vs. the four strongest numeric predictors; one loop
# replaces four near-identical add_trace calls.
fig = make_subplots(rows=2, cols=2,
subplot_titles=('Area', 'Quality', 'Construction Year', 'Garage size'))
panels = [('Total.Area', 1, 1), ('Overall.Qual', 1, 2),
          ('Year.Built', 2, 1), ('Garage.Area', 2, 2)]
for column, row, col in panels:
    fig.add_trace(
        go.Scatter(x=ames_clean[column], y=ames_clean['price'], opacity=0.65,
                   mode="markers"),
        row=row, col=col
    )
fig.update_layout(height=800, width=800, title_text="Price relationships with explanatory variables")
fig.show()
Certainly, when we think about moving, one of the first things that comes to mind is the neighborhood or region of the property. Below we can see that this has a considerable impact on the selling price. While we have neighborhoods where you can't get anything for less than 170 thousand, in others, not even the most expensive properties reach this price.
# Price distribution per neighborhood (the unused `df = px.data.tips()`
# leftover was removed)
fig = px.box(ames_clean, x='Neighborhood', y="price")
fig.show()
Looking at the average price per square foot in each neighborhood, it is possible to observe that the GrnHill neighborhood has the highest average price, which is also due to the small number of properties sold in the region.
# Mean price per square foot by neighborhood, sorted ascending
# (commented-out dead code removed)
ames_Neighborhood = (ames_clean.groupby('Neighborhood', as_index=True)['PricePerSF']
                     .mean()
                     .sort_values(ascending=True))
fig = px.bar(ames_Neighborhood, y='PricePerSF')
fig.show()
As the garage area showed a strong correlation with price, I decided to analyze whether the type of garage also exerts any relevant influence on property pricing. In general, the type of garage seems to positively influence the price when it is built into the house, probably because it offers more comfort and protection against the weather. On the other hand, as expected, the non-existence of a garage tends to be associated with properties sold at a lower price.
# Price distribution per garage type (the unused `df = px.data.tips()`
# leftover was removed)
fig = px.box(ames_clean, x='Garage.Type', y="price")
fig.show()
A good location is just as important as a spacious home or superior construction. This is clear in regions such as Stone Broken Road (StoneBr in the data set), which is close to parks, or Greensboro (Greens), which is close to the University. After controlling for location, factors such as size and type of garage, year of construction and quality of work are also of great importance.
# Load both datasets and merge them so missing values get a single,
# consistent treatment. (The dead `range1`/`usecols` lines copied from the
# earlier cell were removed — they were never used here.)
amesTrain = pd.read_csv('ames_train.csv', sep=';')
amesTest = pd.read_csv('ames_test.csv', sep=';')
print(amesTrain.shape)
print(amesTest.shape)
amesTrain.head()
# Keep the PIDs of each dataset so the merged frame can be split back into
# train and test later. (Commented-out dead code removed.)
ames_df = pd.concat([amesTrain, amesTest], axis=0)
trainPID = amesTrain['PID']
testPID = amesTest['PID']
ames_df = ames_df[['PID', 'price', 'area', 'X1st.Flr.SF', 'X2nd.Flr.SF',
                   'Low.Qual.Fin.SF', 'Total.Bsmt.SF', 'Year.Built',
                   'Overall.Qual', 'Full.Bath', 'TotRms.AbvGrd', 'Fireplaces',
                   'Neighborhood', 'Bldg.Type', 'Garage.Area', 'Garage.Type']]
# Missing values handled with the same criteria used in the EDA.
# NOTE(review): this mean is computed over train+test combined, which leaks
# test information into the imputation — confirm this is acceptable.
ames_df['Total.Bsmt.SF'] = ames_df['Total.Bsmt.SF'].fillna(ames_df['Total.Bsmt.SF'].mean())
ames_df['Garage.Area'] = ames_df['Garage.Area'].fillna(0)
ames_df['Garage.Type'] = ames_df['Garage.Type'].fillna('No Garage')
# Derived features, as in the EDA.
# NOTE(review): PricePerSF is computed from price, the regression target —
# keeping it as a feature is target leakage (see the model section below).
ames_df['Total.Area'] = ames_df['area'] + ames_df['Total.Bsmt.SF']
ames_df['PricePerSF'] = ames_df['price'] / ames_df['Total.Area']
# Restrict to the regression feature set: most other variables showed no
# correlation or are categorical variables related to each other; the
# exception is Neighborhood, which showed great impact in the EDA.
ames_df = ames_df[['PID', 'price', 'Total.Area', 'PricePerSF', 'Garage.Area',
                   'Year.Built', 'Overall.Qual', 'Full.Bath', 'TotRms.AbvGrd',
                   'Fireplaces', 'Neighborhood']]
ames_df.head()
# Encode the categorical Neighborhood as integer codes so it fits the model.
# astype('category').cat.codes is equivalent to the former
# apply(pd.Categorical) + cat.codes two-step; four lines of commented-out
# experiments were removed.
# NOTE(review): integer codes impose an arbitrary ordering on neighborhoods;
# one-hot encoding would be more appropriate for a linear model — confirm.
ames_df['Neighborhood'] = ames_df['Neighborhood'].astype('category').cat.codes
ames_df.head()
ames_df.info()
# Split into explanatory variables and target, keeping PID only to recover
# the original train/test partition. (Commented-out column list removed.)
X = ames_df[['PID', 'Total.Area', 'Year.Built', 'Overall.Qual', 'Garage.Area',
             'Full.Bath', 'Fireplaces', 'TotRms.AbvGrd', 'PricePerSF',
             'Neighborhood']]
y = ames_df[['PID', 'price']]
# Filter by the provided ids, then drop PID so it does not enter the
# regression. Chaining .drop(columns=...) returns a new frame, avoiding the
# SettingWithCopyWarning raised by `drop(..., inplace=True)` on .loc slices
# (which may also silently fail to mutate under copy-on-write pandas).
X_train = X.loc[X['PID'].isin(trainPID)].drop(columns='PID')
y_train = y.loc[y['PID'].isin(trainPID)].drop(columns='PID')
X_test = X.loc[X['PID'].isin(testPID)].drop(columns='PID')
y_test = y.loc[y['PID'].isin(testPID)].drop(columns='PID')
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# Fit an ordinary least-squares regression on the training data
regressor.fit(X_train, y_train)
# Predict prices for the test data
y_pred = regressor.predict(X_test)
# Evaluate model performance
from sklearn.metrics import r2_score, mean_squared_error
# R2
# NOTE(review): X includes PricePerSF, which was derived from price itself
# (PricePerSF = price / Total.Area above), so this R2 is inflated by target
# leakage — the model partly "predicts" price from price. Consider dropping
# PricePerSF from the feature set.
print(f"R2 score: {r2_score(y_test, y_pred)}")
# Mean squared error
print(f"MSE score: {mean_squared_error(y_test, y_pred)}")
The R² suggests that the model has a high explanatory capacity (0.96), but the mean squared error is very high, which indicates large deviations between the predicted values and the actual observed prices.